/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.subcommands;

import java.util.Arrays;
import java.util.List;

import mosdi.util.Alphabet;
import mosdi.util.Iupac;
import mosdi.util.Log;
import mosdi.util.NamedSequence;
import mosdi.util.ProductEncoder;
import mosdi.util.SequenceUtils;

public class CountQgramsSubcommand extends Subcommand {
	
	@Override
	public String usage() {
		return
		super.usage()+" [options] <fasta-file> <q>\n" +
		"\n" +
		"Options:\n" +
		"  -r: also count q-grams on reverse strand, i.e. both counts (forward and reverse)\n" +
		"      are summed up.\n" +
		"  -p <pseudocounts>: add pseudo counts to each q-gram count (default: 0.0). May be\n" +
		"                     any floating point number.\n" +
		"  -A <alphabet>: if either \"dna\" or \"protein\", the respective alphabet of\n" +
		"                 nucleotides or amino acids is used, respectively. Otherwise,\n" +
		"                 an alphabet consisting of the given characters is used (default:\"dna\").";
	}
	
	@Override
	public String description() {
		return "Counts q-grams in a given set of sequences.";
	}

	@Override
	public String name() {
		return "count-qgrams";
	}

	private static String printDouble(double d) {
		boolean isIntLike = (d<=Long.MAX_VALUE) && (d>=Long.MIN_VALUE) && (d==(double)((long)d));
		if (isIntLike) {
			return Long.toString((long)d);
		} else {
			return Double.toString(d);
		}
	}
	
	@Override
	public int run(String[] args) {
		parseOptions(args, 2, "rp:A:");

		// Option dependencies
		exclusiveOptions("A", "r");

		// Mandatory arguments
		String filename = getStringArgument(0);
		int q = getIntArgument(1);

		// Options
		boolean considerReverse = getBooleanOption("r", false);
		double pseudoCount = getDoubleOption("p", 0.0);
		String alphabetParameter = getStringOption("A", "dna");

		Alphabet alphabet;
		if (alphabetParameter.equals("dna")) {
			alphabet = Alphabet.getDnaAlphabet();
		} else if (alphabetParameter.equals("protein")) {
			alphabet = Alphabet.getAminoAcidAlphabet();
		} else {
			alphabet = new Alphabet(alphabetParameter);
		}

		List<NamedSequence> sequences = null;
		try {
			sequences = SequenceUtils.readFastaFile(filename, alphabet, true);
		} catch (Exception e) {
			Log.errorln(e.toString());
			System.exit(1);
		}
		
		// count q-grams
		double[] qgramCounts = new double[(int)Math.pow(alphabet.size(), q)];
		Arrays.fill(qgramCounts, pseudoCount);
		for (NamedSequence ns : sequences) {
			SequenceUtils.countQGrams(q, alphabet.size(), ns.getSequence(), qgramCounts); 
			if (considerReverse) {
				int[] reverse = Iupac.dnaReverseComplementary(ns.getSequence());
				SequenceUtils.countQGrams(q, alphabet.size(), reverse , qgramCounts);
			}
		}
		ProductEncoder pe = SequenceUtils.qGramEncoder(q, alphabet.size());
		for (int qgram=0; qgram<qgramCounts.length; ++qgram) {
			Log.printf(Log.Level.STANDARD, "%s \t%s%n", alphabet.buildString(pe.decode(qgram)), printDouble(qgramCounts[qgram]));
		}
		return 0;
	}
}
